In [1]:
import keras
In [2]:
keras.__version__
Out[2]:
In [3]:
from keras.layers import Embedding
# The maximum number of tokens is equal to the maximum word index + 1
max_number_of_tokens = 1000
embedding_dimensionality = 64
embedding_layer = Embedding(max_number_of_tokens, embedding_dimensionality)
The layer transforms a 2D input tensor of integers with shape (number_of_samples, sequence_length) into a 3D floating-point tensor with shape (number_of_samples, sequence_length, embedding_dimensionality). Such a tensor can then be processed by an RNN layer or a 1D convolutional layer.
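To see that shape transformation in action, a minimal sketch along the following lines can be used (the batch size of 32 and the sequence length of 10 are arbitrary illustration values, not part of the notebook above):
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding

demo_model = Sequential()
demo_model.add(Embedding(input_dim = max_number_of_tokens,
                         output_dim = embedding_dimensionality,
                         input_length = 10))
# 32 samples, each a sequence of 10 integer word indices
dummy_batch = np.random.randint(0, max_number_of_tokens, size = (32, 10))
print(demo_model.predict(dummy_batch).shape)  # expected: (32, 10, 64)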
In [4]:
from keras.datasets import imdb
from keras.preprocessing.sequence import pad_sequences
In [5]:
# Number of words considered as features
max_features = 10000
In [6]:
# Cutting the reviews off after only 20 words
sequence_max_length = 20
In [7]:
# Loading data
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = max_features)
In [8]:
x_train.shape
Out[8]:
In [9]:
x_train_sequence = pad_sequences(x_train, maxlen = sequence_max_length)
In [10]:
x_train_sequence.shape
Out[10]:
In [11]:
x_train[0:2]
Out[11]:
In [12]:
# pad_sequences keeps the last 20 tokens by default, so this element matches x_train_sequence[0, 0]
x_train[0][-20]
Out[12]:
In [13]:
x_train_sequence[0, :]
Out[13]:
In [14]:
x_train_sequence[0]
Out[14]:
In [15]:
x_train_sequence[1]
Out[15]:
In [16]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
In [17]:
model = Sequential()
model.add(Embedding(input_dim = max_features, output_dim = 8, input_length = sequence_max_length))
model.add(Flatten())
model.add(Dense(units = 1, activation = 'sigmoid'))
In [18]:
# Compiling the model
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])
In [19]:
model.summary()
In [20]:
# Training
history = model.fit(x = x_train_sequence,
                    y = y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_split = 0.2)
The raw IMDB dataset can be downloaded from: http://mng.bz/0tIo
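If the raw data is not already on disk, a download-and-extract sketch along these lines could be used (this assumes the short link serves the aclImdb_v1.tar.gz archive directly; the target directory mirrors the imdb_dir used below):
import os
import tarfile
import urllib.request

data_dir = './data/Chapter 6.1.2 - Using word embeddings/'
archive_path = os.path.join(data_dir, 'aclImdb_v1.tar.gz')
os.makedirs(data_dir, exist_ok = True)
if not os.path.exists(archive_path):
    # Assumption: the short link redirects to the .tar.gz archive itself
    urllib.request.urlretrieve('http://mng.bz/0tIo', archive_path)
with tarfile.open(archive_path, 'r:gz') as archive:
    # Creates the aclImdb/ folder used as imdb_dir below
    archive.extractall(data_dir)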
In [21]:
import os
In [22]:
imdb_dir = './data/Chapter 6.1.2 - Using word embeddings/aclImdb/'
In [23]:
train_dir = os.path.join(imdb_dir, 'train')
In [24]:
labels = []
texts = []
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        # Only taking .txt files into consideration
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname), encoding="utf8")
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
In [25]:
len(labels)
Out[25]:
In [26]:
len(texts)
Out[26]:
In [27]:
texts[0]
Out[27]:
In [28]:
labels[0]
Out[28]:
In [29]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
In [30]:
# Using only the first 100 words of each review
maxlen = 100
In [31]:
# Number of training samples
training_samples = 200
In [32]:
# Number of validation samples
validation_samples = 10000
In [33]:
# Tokenizing only the top 10,000 words in the dataset
max_words = 10000
In [34]:
# Initializing Tokenizer
tokenizer = Tokenizer(num_words = max_words)
In [35]:
# Fitting the Tokenizer on the text
tokenizer.fit_on_texts(texts)
In [36]:
# Converting the texts to sequences of word indices
sequences = tokenizer.texts_to_sequences(texts)
In [37]:
sequences[0:2]
Out[37]:
In [38]:
# Word index
word_index = tokenizer.word_index
In [39]:
type(word_index)
Out[39]:
In [40]:
first10pairs = {k: word_index[k] for k in list(word_index)[:10]}
In [41]:
first10pairs
Out[41]:
In [42]:
# Padding the sequences so they all have length maxlen
data = pad_sequences(sequences, maxlen = maxlen)
In [43]:
data.shape
Out[43]:
In [44]:
# Converting the label list to a NumPy array
labels = np.asarray(labels)
In [45]:
labels.shape
Out[45]:
In [46]:
# Shuffling the data, since the samples are ordered (all negative reviews first, then all positive)
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
In [47]:
# Splitting the data into train and validation datasets
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]
In [48]:
x_train.shape
Out[48]:
In [49]:
x_val.shape
Out[49]:
The GloVe embeddings can be downloaded from: http://nlp.stanford.edu/data/glove.6B.zip
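To fetch and unpack the embeddings programmatically, a minimal sketch such as the following could be used (the archive is several hundred megabytes; the target directory mirrors the glove_dir used below):
import os
import zipfile
import urllib.request

glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
glove_zip = './data/Chapter 6.1.2 - Using word embeddings/glove.6B.zip'
os.makedirs(glove_dir, exist_ok = True)
if not os.path.exists(glove_zip):
    urllib.request.urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', glove_zip)
with zipfile.ZipFile(glove_zip) as archive:
    # Extracts glove.6B.100d.txt (among others) into glove_dir
    archive.extractall(glove_dir)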
In [50]:
# Importing tqdm to show a progress bar
from tqdm import tqdm
In [51]:
glove_dir = './data/Chapter 6.1.2 - Using word embeddings/glove.6B/'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'),
         encoding = 'utf-8')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],
                       dtype = 'float32')
    embeddings_index[word] = coefs
f.close()
In [52]:
len(embeddings_index)
Out[52]:
In [53]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        # Words not found in the embedding index will be represented as zeros
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
In [54]:
embedding_matrix
Out[54]:
In [55]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
In [56]:
model = Sequential()
model.add(Embedding(input_dim = max_words,
                    output_dim = embedding_dim,
                    input_length = maxlen))
model.add(Flatten())
model.add(Dense(units = 32,
                activation = 'relu'))
model.add(Dense(units = 1,
                activation = 'sigmoid'))
model.summary()
In [57]:
# Loading pretrained word embeddings
model.layers[0].set_weights([embedding_matrix])
# Freezing the layer
model.layers[0].trainable = False
In [58]:
model.compile(optimizer = 'rmsprop',
              loss = 'binary_crossentropy',
              metrics = ['acc'])
In [59]:
history = model.fit(x = x_train,
                    y = y_train,
                    epochs = 10,
                    batch_size = 32,
                    validation_data = (x_val, y_val))
In [60]:
model.save_weights('./saved_checkpoints/Chapter 6.1.2 - Using word embeddings/pre_trained_glove_model.h5')
In [61]:
import matplotlib.pyplot as plt
In [62]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
The model overfits very quickly, which is unsurprising given that there are only 200 training samples. As a baseline, the same model is trained below without the pretrained GloVe embeddings.
In [63]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_data=(x_val, y_val))
In [64]:
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()